In [1]:
    
# Import standard-library helpers, then py_entitymatching and the profiling packages
import os
import tempfile

import pandas as pd
import pandas_profiling
import py_entitymatching as em
    
Then, read the (sample) input table.
In [2]:
    
# Locate the 'datasets' directory under the py_entitymatching install path.
# os.path.join inserts the platform separator and avoids a doubled separator
# if the install path already ends with one.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')
# Build the path of the input table (the DBLP demo CSV)
path_A = os.path.join(datasets_dir, 'dblp_demo.csv')
    
In [4]:
    
# Read the CSV file into table A and set the 'id' column as the key attribute
A = em.read_csv_metadata(path_A, key='id')
# Preview the first rows of the table
A.head()
    
    
Metadata file is not present in the given path; proceeding to read the csv file.
    Out[4]:
  
    
       
      id 
      title 
      authors 
      venue 
      year 
     
  
  
    
      0 
      l0 
      Paradise: A Database System for GIS Applications 
      Paradise Team 
      SIGMOD Conference 
      1995 
     
    
      1 
      l1 
      A Query Language and Optimization Techniques for Unstructured Data 
      Gerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan Suciu 
      SIGMOD Conference 
      1996 
     
    
      2 
      l2 
      Turbo-charging Vertical Mining of Large Databases 
      Jayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav Bhalotia 
      SIGMOD Conference 
      2000 
     
    
      3 
      l3 
      Maintenance of Data Cubes and Summary Tables in a Warehouse 
      Inderpal Singh Mumick, Dallan Quass, Barinderpal Singh Mumick 
      SIGMOD Conference 
      1997 
     
    
      4 
      l4 
      On Relational Support for XML Publishing: Beyond Sorting and Tagging 
      Raghav Kaushik, Jeffrey F. Naughton, Surajit Chaudhuri 
      SIGMOD Conference 
      2003 
     
  
In [5]:
    
# Build a profiling report for table A; as the cell's last expression it is rendered inline below
pandas_profiling.ProfileReport(A)
    
    Out[5]:
    
        Overview
    
    
    
        Dataset info
        
            
            
                Number of variables 
                5  
             
            
                Number of observations 
                1800  
             
            
                Total Missing (%) 
                0.0%  
             
            
                Total size in memory 
                70.4 KiB  
             
            
                Average record size in memory 
                40.0 B  
             
            
        
    
    
        Variables types
        
            
            
                Numeric 
                1  
             
            
                Categorical 
                3  
             
            
                Date 
                0  
             
            
                Text (Unique) 
                1  
             
            
                Rejected 
                0  
             
            
        
    
    
        Warnings
        authors has a high cardinality: 1703 distinct values (Warning)
        title has a high cardinality: 1797 distinct values (Warning)
    
    
        Variables
    
    
    
        authors
            Categorical
        
    
    
        
            Distinct count 
            1703 
         
        
            Unique (%) 
            94.6% 
         
        
            Missing (%) 
            0.0% 
         
        
            Missing (n) 
            0 
         
    
    
        
    Dan Suciu 
    
        
             
        
        7
     
 
    C. Mohan 
    
        
             
        
        6
     
 
    Andrew Eisenberg, Jim Melton 
    
        
             
        
        5
     
 
    Other values (1700) 
    
        
            1782
        
        
     
 
    
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        Dan Suciu 
        7 
        0.4% 
        
             
         
 
        C. Mohan 
        6 
        0.3% 
        
             
         
 
        Andrew Eisenberg, Jim Melton 
        5 
        0.3% 
        
             
         
 
        Xiaolei Qian 
        5 
        0.3% 
        
             
         
 
        Joseph M. Hellerstein 
        4 
        0.2% 
        
             
         
 
        Richard T. Snodgrass 
        4 
        0.2% 
        
             
         
 
        Praveen Seshadri 
        3 
        0.2% 
        
             
         
 
        H. V. Jagadish 
        3 
        0.2% 
        
             
         
 
        Nam Huyn 
        3 
        0.2% 
        
             
         
 
        Viswanath Poosala, Yannis E. Ioannidis 
        3 
        0.2% 
        
             
         
 
        Other values (1693) 
        1757 
        97.6% 
        
             
         
 
    
        id
            Categorical, Unique
        
    
  
    
      First 3 values 
     
  
  
    
      l415 
     
    
      l1574 
     
    
      l1364 
     
  
  
    
      Last 3 values 
     
  
  
    
      l492 
     
    
      l273 
     
    
      l92 
     
  
    First 10 values
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        l0 
        1 
        0.1% 
        
             
         
 
        l1 
        1 
        0.1% 
        
             
         
 
        l10 
        1 
        0.1% 
        
             
         
 
        l100 
        1 
        0.1% 
        
             
         
 
        l1000 
        1 
        0.1% 
        
             
         
 
    Last 10 values
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        l995 
        1 
        0.1% 
        
             
         
 
        l996 
        1 
        0.1% 
        
             
         
 
        l997 
        1 
        0.1% 
        
             
         
 
        l998 
        1 
        0.1% 
        
             
         
 
        l999 
        1 
        0.1% 
        
             
         
 
    
        title
            Categorical
        
    
    
        
            Distinct count 
            1797 
         
        
            Unique (%) 
            99.8% 
         
        
            Missing (%) 
            0.0% 
         
        
            Missing (n) 
            0 
         
    
    
        
    Editorial 
    
        
             
        
        2
     
 
    Guest editorial 
    
        
             
        
        2
     
 
    Keynote Address 
    
        
             
        
        2
     
 
    Other values (1794) 
    
        
            1794
        
        
     
 
    
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        Editorial 
        2 
        0.1% 
        
             
         
 
        Guest editorial 
        2 
        0.1% 
        
             
         
 
        Keynote Address 
        2 
        0.1% 
        
             
         
 
        Integrating Modelling Systems for Environmental Management Information Systems 
        1 
        0.1% 
        
             
         
 
        Historical Queries Along Multiple Lines of Time Evolution 
        1 
        0.1% 
        
             
         
 
        Selectivity Estimation Without the Attribute Value Independence Assumption 
        1 
        0.1% 
        
             
         
 
        Analysis of existing databases at the logical level: the DBA companion project 
        1 
        0.1% 
        
             
         
 
        Using Versions in Update Transactions: Application to Integrity Checking 
        1 
        0.1% 
        
             
         
 
        Power efficient data gathering and aggregation in wireless sensor networks 
        1 
        0.1% 
        
             
         
 
        Instance-based attribute identification in database integration 
        1 
        0.1% 
        
             
         
 
        Other values (1787) 
        1787 
        99.3% 
        
             
         
 
    
        venue
            Categorical
        
    
    
        
            Distinct count 
            5 
         
        
            Unique (%) 
            0.3% 
         
        
            Missing (%) 
            0.0% 
         
        
            Missing (n) 
            0 
         
    
    
        
    SIGMOD Conference 
    
        
            654
        
        
     
 
    VLDB 
    
        
            512
        
        
     
 
    SIGMOD Record 
    
        
            381
        
        
     
 
    Other values (2) 
    
        
            253
        
        
     
 
    
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        SIGMOD Conference 
        654 
        36.3% 
        
             
         
 
        VLDB 
        512 
        28.4% 
        
             
         
 
        SIGMOD Record 
        381 
        21.2% 
        
             
         
 
        VLDB J. 
        146 
        8.1% 
        
             
         
 
        ACM Trans. Database Syst. 
        107 
        5.9% 
        
             
         
 
    
        year
            Numeric
        
    
    
        
            
                
                    Distinct count 
                    10 
                 
                
                    Unique (%) 
                    0.6% 
                 
                
                    Missing (%) 
                    0.0% 
                 
                
                    Missing (n) 
                    0 
                 
                
                    Infinite (%) 
                    0.0% 
                 
                
                    Infinite (n) 
                    0 
                 
            
        
        
            
                
                    Mean 
                    1998.4 
                 
                
                    Minimum 
                    1994 
                 
                
                    Maximum 
                    2003 
                 
                
                    Zeros (%) 
                    0.0% 
                 
            
        
    
    
 
    
    
        
            
                Quantile statistics
                
                    
                        Minimum 
                        1994 
                     
                    
                        5-th percentile 
                        1994 
                     
                    
                        Q1 
                        1996 
                     
                    
                        Median 
                        1998 
                     
                    
                        Q3 
                        2001 
                     
                    
                        95-th percentile 
                        2003 
                     
                    
                        Maximum 
                        2003 
                     
                    
                        Range 
                        9 
                     
                    
                        Interquartile range 
                        5 
                     
                
            
            
                Descriptive statistics
                
                    
                        Standard deviation 
                        2.8231 
                     
                    
                        Coef of variation 
                        0.0014127 
                     
                    
                        Kurtosis 
                        -1.2004 
                     
                    
                        Mean 
                        1998.4 
                     
                    
                        MAD 
                        2.4525 
                     
                    
                        Skewness 
                        -0.007014 
                     
                    
                        Sum 
                        3597166 
                     
                    
                        Variance 
                        7.97 
                     
                    
                        Memory size 
                        14.1 KiB 
                     
                
            
        
        
             
        
        
            
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        2001 
        218 
        12.1% 
        
             
         
 
        1998 
        194 
        10.8% 
        
             
         
 
        2000 
        191 
        10.6% 
        
             
         
 
        1995 
        188 
        10.4% 
        
             
         
 
        1996 
        182 
        10.1% 
        
             
         
 
        1994 
        182 
        10.1% 
        
             
         
 
        1999 
        176 
        9.8% 
        
             
         
 
        1997 
        164 
        9.1% 
        
             
         
 
        2003 
        154 
        8.6% 
        
             
         
 
        2002 
        151 
        8.4% 
        
             
         
 
        
        
            Minimum 5 values
            
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        1994 
        182 
        10.1% 
        
             
         
 
        1995 
        188 
        10.4% 
        
             
         
 
        1996 
        182 
        10.1% 
        
             
         
 
        1997 
        164 
        9.1% 
        
             
         
 
        1998 
        194 
        10.8% 
        
             
         
 
            Maximum 5 values
            
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        1999 
        176 
        9.8% 
        
             
         
 
        2000 
        191 
        10.6% 
        
             
         
 
        2001 
        218 
        12.1% 
        
             
         
 
        2002 
        151 
        8.4% 
        
             
         
 
        2003 
        154 
        8.6% 
        
             
         
 
        
    
    
        Sample
    
    
    
        
  
    
       
      id 
      title 
      authors 
      venue 
      year 
     
  
  
    
      0 
      l0 
      Paradise: A Database System for GIS Applications 
      Paradise Team 
      SIGMOD Conference 
      1995 
     
    
      1 
      l1 
      A Query Language and Optimization Techniques for Unstructured Data 
      Gerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan Suciu 
      SIGMOD Conference 
      1996 
     
    
      2 
      l2 
      Turbo-charging Vertical Mining of Large Databases 
      Jayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav Bhalotia 
      SIGMOD Conference 
      2000 
     
    
      3 
      l3 
      Maintenance of Data Cubes and Summary Tables in a Warehouse 
      Inderpal Singh Mumick, Dallan Quass, Barinderpal Singh Mumick 
      SIGMOD Conference 
      1997 
     
    
      4 
      l4 
      On Relational Support for XML Publishing: Beyond Sorting and Tagging 
      Raghav Kaushik, Jeffrey F. Naughton, Surajit Chaudhuri 
      SIGMOD Conference 
      2003 
     
  
    
In [6]:
    
# Build the profile report for table A and keep a handle on it so it can be
# both exported and displayed.
pfr = pandas_profiling.ProfileReport(A)
# Write the report as an HTML file into the platform's temporary directory
# (typically /tmp on Linux) instead of a hard-coded POSIX path, so the cell
# also runs on systems without a /tmp directory.
report_path = os.path.join(tempfile.gettempdir(), "example.html")
pfr.to_file(report_path)
    
In [7]:
    
# Display the report object created in the previous cell (rendered inline as the cell output)
pfr
    
    Out[7]:
    
        Overview
    
    
    
        Dataset info
        
            
            
                Number of variables 
                5  
             
            
                Number of observations 
                1800  
             
            
                Total Missing (%) 
                0.0%  
             
            
                Total size in memory 
                70.4 KiB  
             
            
                Average record size in memory 
                40.0 B  
             
            
        
    
    
        Variables types
        
            
            
                Numeric 
                1  
             
            
                Categorical 
                3  
             
            
                Date 
                0  
             
            
                Text (Unique) 
                1  
             
            
                Rejected 
                0  
             
            
        
    
    
        Warnings
        authors has a high cardinality: 1703 distinct values (Warning)
        title has a high cardinality: 1797 distinct values (Warning)
    
    
        Variables
    
    
    
        authors
            Categorical
        
    
    
        
            Distinct count 
            1703 
         
        
            Unique (%) 
            94.6% 
         
        
            Missing (%) 
            0.0% 
         
        
            Missing (n) 
            0 
         
    
    
        
    Dan Suciu 
    
        
             
        
        7
     
 
    C. Mohan 
    
        
             
        
        6
     
 
    Andrew Eisenberg, Jim Melton 
    
        
             
        
        5
     
 
    Other values (1700) 
    
        
            1782
        
        
     
 
    
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        Dan Suciu 
        7 
        0.4% 
        
             
         
 
        C. Mohan 
        6 
        0.3% 
        
             
         
 
        Andrew Eisenberg, Jim Melton 
        5 
        0.3% 
        
             
         
 
        Xiaolei Qian 
        5 
        0.3% 
        
             
         
 
        Joseph M. Hellerstein 
        4 
        0.2% 
        
             
         
 
        Richard T. Snodgrass 
        4 
        0.2% 
        
             
         
 
        Praveen Seshadri 
        3 
        0.2% 
        
             
         
 
        H. V. Jagadish 
        3 
        0.2% 
        
             
         
 
        Nam Huyn 
        3 
        0.2% 
        
             
         
 
        Viswanath Poosala, Yannis E. Ioannidis 
        3 
        0.2% 
        
             
         
 
        Other values (1693) 
        1757 
        97.6% 
        
             
         
 
    
        id
            Categorical, Unique
        
    
  
    
      First 3 values 
     
  
  
    
      l415 
     
    
      l1574 
     
    
      l1364 
     
  
  
    
      Last 3 values 
     
  
  
    
      l492 
     
    
      l273 
     
    
      l92 
     
  
    First 10 values
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        l0 
        1 
        0.1% 
        
             
         
 
        l1 
        1 
        0.1% 
        
             
         
 
        l10 
        1 
        0.1% 
        
             
         
 
        l100 
        1 
        0.1% 
        
             
         
 
        l1000 
        1 
        0.1% 
        
             
         
 
    Last 10 values
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        l995 
        1 
        0.1% 
        
             
         
 
        l996 
        1 
        0.1% 
        
             
         
 
        l997 
        1 
        0.1% 
        
             
         
 
        l998 
        1 
        0.1% 
        
             
         
 
        l999 
        1 
        0.1% 
        
             
         
 
    
        title
            Categorical
        
    
    
        
            Distinct count 
            1797 
         
        
            Unique (%) 
            99.8% 
         
        
            Missing (%) 
            0.0% 
         
        
            Missing (n) 
            0 
         
    
    
        
    Editorial 
    
        
             
        
        2
     
 
    Guest editorial 
    
        
             
        
        2
     
 
    Keynote Address 
    
        
             
        
        2
     
 
    Other values (1794) 
    
        
            1794
        
        
     
 
    
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        Editorial 
        2 
        0.1% 
        
             
         
 
        Guest editorial 
        2 
        0.1% 
        
             
         
 
        Keynote Address 
        2 
        0.1% 
        
             
         
 
        Integrating Modelling Systems for Environmental Management Information Systems 
        1 
        0.1% 
        
             
         
 
        Historical Queries Along Multiple Lines of Time Evolution 
        1 
        0.1% 
        
             
         
 
        Selectivity Estimation Without the Attribute Value Independence Assumption 
        1 
        0.1% 
        
             
         
 
        Analysis of existing databases at the logical level: the DBA companion project 
        1 
        0.1% 
        
             
         
 
        Using Versions in Update Transactions: Application to Integrity Checking 
        1 
        0.1% 
        
             
         
 
        Power efficient data gathering and aggregation in wireless sensor networks 
        1 
        0.1% 
        
             
         
 
        Instance-based attribute identification in database integration 
        1 
        0.1% 
        
             
         
 
        Other values (1787) 
        1787 
        99.3% 
        
             
         
 
    
        venue
            Categorical
        
    
    
        
            Distinct count 
            5 
         
        
            Unique (%) 
            0.3% 
         
        
            Missing (%) 
            0.0% 
         
        
            Missing (n) 
            0 
         
    
    
        
    SIGMOD Conference 
    
        
            654
        
        
     
 
    VLDB 
    
        
            512
        
        
     
 
    SIGMOD Record 
    
        
            381
        
        
     
 
    Other values (2) 
    
        
            253
        
        
     
 
    
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        SIGMOD Conference 
        654 
        36.3% 
        
             
         
 
        VLDB 
        512 
        28.4% 
        
             
         
 
        SIGMOD Record 
        381 
        21.2% 
        
             
         
 
        VLDB J. 
        146 
        8.1% 
        
             
         
 
        ACM Trans. Database Syst. 
        107 
        5.9% 
        
             
         
 
    
        year
            Numeric
        
    
    
        
            
                
                    Distinct count 
                    10 
                 
                
                    Unique (%) 
                    0.6% 
                 
                
                    Missing (%) 
                    0.0% 
                 
                
                    Missing (n) 
                    0 
                 
                
                    Infinite (%) 
                    0.0% 
                 
                
                    Infinite (n) 
                    0 
                 
            
        
        
            
                
                    Mean 
                    1998.4 
                 
                
                    Minimum 
                    1994 
                 
                
                    Maximum 
                    2003 
                 
                
                    Zeros (%) 
                    0.0% 
                 
            
        
    
    
 
    
    
        
            
                Quantile statistics
                
                    
                        Minimum 
                        1994 
                     
                    
                        5-th percentile 
                        1994 
                     
                    
                        Q1 
                        1996 
                     
                    
                        Median 
                        1998 
                     
                    
                        Q3 
                        2001 
                     
                    
                        95-th percentile 
                        2003 
                     
                    
                        Maximum 
                        2003 
                     
                    
                        Range 
                        9 
                     
                    
                        Interquartile range 
                        5 
                     
                
            
            
                Descriptive statistics
                
                    
                        Standard deviation 
                        2.8231 
                     
                    
                        Coef of variation 
                        0.0014127 
                     
                    
                        Kurtosis 
                        -1.2004 
                     
                    
                        Mean 
                        1998.4 
                     
                    
                        MAD 
                        2.4525 
                     
                    
                        Skewness 
                        -0.007014 
                     
                    
                        Sum 
                        3597166 
                     
                    
                        Variance 
                        7.97 
                     
                    
                        Memory size 
                        14.1 KiB 
                     
                
            
        
        
             
        
        
            
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        2001 
        218 
        12.1% 
        
             
         
 
        1998 
        194 
        10.8% 
        
             
         
 
        2000 
        191 
        10.6% 
        
             
         
 
        1995 
        188 
        10.4% 
        
             
         
 
        1996 
        182 
        10.1% 
        
             
         
 
        1994 
        182 
        10.1% 
        
             
         
 
        1999 
        176 
        9.8% 
        
             
         
 
        1997 
        164 
        9.1% 
        
             
         
 
        2003 
        154 
        8.6% 
        
             
         
 
        2002 
        151 
        8.4% 
        
             
         
 
        
        
            Minimum 5 values
            
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        1994 
        182 
        10.1% 
        
             
         
 
        1995 
        188 
        10.4% 
        
             
         
 
        1996 
        182 
        10.1% 
        
             
         
 
        1997 
        164 
        9.1% 
        
             
         
 
        1998 
        194 
        10.8% 
        
             
         
 
            Maximum 5 values
            
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        1999 
        176 
        9.8% 
        
             
         
 
        2000 
        191 
        10.6% 
        
             
         
 
        2001 
        218 
        12.1% 
        
             
         
 
        2002 
        151 
        8.4% 
        
             
         
 
        2003 
        154 
        8.6% 
        
             
         
 
        
    
    
        Sample
    
    
    
        
  
    
       
      id 
      title 
      authors 
      venue 
      year 
     
  
  
    
      0 
      l0 
      Paradise: A Database System for GIS Applications 
      Paradise Team 
      SIGMOD Conference 
      1995 
     
    
      1 
      l1 
      A Query Language and Optimization Techniques for Unstructured Data 
      Gerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan Suciu 
      SIGMOD Conference 
      1996 
     
    
      2 
      l2 
      Turbo-charging Vertical Mining of Large Databases 
      Jayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav Bhalotia 
      SIGMOD Conference 
      2000 
     
    
      3 
      l3 
      Maintenance of Data Cubes and Summary Tables in a Warehouse 
      Inderpal Singh Mumick, Dallan Quass, Barinderpal Singh Mumick 
      SIGMOD Conference 
      1997 
     
    
      4 
      l4 
      On Relational Support for XML Publishing: Beyond Sorting and Tagging 
      Raghav Kaushik, Jeffrey F. Naughton, Surajit Chaudhuri 
      SIGMOD Conference 
      2003 
     
  
    
In [ ]:
    
    
Content source: anhaidgroup/py_entitymatching
Similar notebooks: